

************************************************************
************************************************************
***      Retrieving Panel Data from the SOEP             ***
***      4: Creating an Episode (Spell) File             ***
***         Josef Brüderl, Volker Ludwig                 *** 	   
***                  May 2012                            *** 	   
************************************************************
************************************************************

* Data: SOEP 1984-2010 v27

* LEAVING UNEMPLOYMENT
* We extract unemployment episodes from ARTKALEN
* The aim is to do a repeated events analysis

* --- Session setup ------------------------------------------------
version 12          // interpret this do-file under Stata 12 rules
set more off        // never pause the output for a --more-- prompt
clear               // start without a dataset in memory
clear matrix        // remove matrices left over in memory

* --- Directories ----------------------------------------------------
* Both globals are used unquoted in the cd commands further down, so the
* paths must not contain blanks.
global pfad2 "C:\SOEP\work\"       // working directory (intermediate and final files)
global pfad1 "C:\SOEP\V27\"        // directory of the original SOEP data


***************************************************************
*** 1. STEP: CREATING THE EPISODE DATASET FROM ARTKALEN *******
***************************************************************
cd $pfad1
use artkalen.dta, clear

* Harmonise names: person identifier plus first and last month of each spell
rename (persnr begin end) (id entry exit)
sort id entry

* A first look at the spell data
tab1 spelltyp zensor                                   // spelltyp 5 = unemployment
list id spelltyp entry exit zensor if id==203,  table  // example of a volatile life course

* Restrict the file to unemployment spells
drop if spelltyp != 5

* Occurrence dependence: number of earlier unemployment spells of the person.
* Computed before the censoring filter below, so left-censored spells count.
by id (entry), sort: generate nprev = _n - 1

* Remove left-censored spells (zensor above 3; a missing zensor is removed too)
drop if zensor > 3

* Keep the spell variables and store them as the master file in the work directory
keep id entry exit zensor nprev
cd $pfad2
save master.dta, replace


/*******************************************************************************
* For a competing risks analysis one would also need the subsequent status:
gen   unemp = spelltyp==5                  //indicator for unemployment episode 
bysort id (entry): gen nunemp=sum(unemp)   //generate running number of unemp. episode
drop if    nunemp==0                       //drop episodes before first unemp. episode
* Problem: there can be more than one subsequent status
drop if spelltyp==99                                 //drop gaps (Lücke)
bysort id nunemp (entry): gen  help1 = exit[1] + 1   //exit month (+1) unemp. episode
bysort id nunemp        : gen  help2 =              ///1 indicates a subsequent episode
                               (entry<=help1 & exit>=help1) & unemp==0
bysort id nunemp        : egen help3 = total(help2)   //total # of subsequent episodes
tab help3 if unemp==1, miss
* We drop episodes with multiple subsequent episodes, because we cannot decide
drop if help3 > 1
* Now we retrieve the subsequent status
keep if unemp==1 | help2==1                         //keep only the subsequent episode
bysort id nunemp (help2): gen subst = spelltyp[_n+1] 
recode subst 1=1 2 3 15=2 4 13=3 8=4 14=5 10 7=6 6=7 9 11 12=8 .=0
label define substlbl 0 "censored" 1 "full-time" 2 "part-time" 3 "apprenticeship" ///
                      4 "school, university" 5 "training program"                 ///
					  6 "parental leave, houseman" 7 "retirement" 8 "other" 
label values subst substlbl
* Keep only unemployment episodes
keep if unemp==1
tab subst, miss
*******************************************************************************/


***************************************************************
*** 2. STEP: PULL VARIABLES FROM PPFAD                  *******
***************************************************************
cd $pfad1
* Read only the needed PPFAD variables: psample, sex, gebjahr and gebmonat
* should always be pulled; loc1989 and migback are optional extras.
use persnr psample sex gebjahr gebmonat loc1989 migback using ppfad.dta, clear
rename persnr id                 // person identifier is called id throughout

cd $pfad2
* One PPFAD row per person is matched to all of that person's episodes
merge 1:m id using master.dta
keep if _merge != 1              // discard persons without any unemployment episode
drop _merge                      // must be gone before the next merge


***************************************************************
*** 3. STEP: GENERATING IMPORTANT VARIABLES             *******
***************************************************************

* Creating IDs and counters
sort id entry
gen                    epiid  = _n  //episode id (unique over the whole file)
bysort id (entry): gen epinr  = _n  //episodes numbered consecutively (within person)
bysort id        : gen epitot = _N  //total number of unemployment episodes per person
tab epinr                  //episodes by their rank within person (N*T rows)
tab epitot  if epinr==1    //persons by total number of episodes (one row per person, N)
tab zensor                 //censoring status of the episodes (N*T rows)

* Imputing birthmonth (many missings)
* tab gebmonat if epinr==1               //many missing birthmonths
* gebmonat comes from PPFAD and is constant within person. The random month
* is therefore drawn per row but then copied from the first episode of a
* person to all of that person's episodes; otherwise the same person would
* end up with a different imputed birth month on every episode.
set seed 13243567
gen rmonth = 1+int(12*runiform())              //random month 1..12, one draw per row
bysort id (epinr): replace rmonth = rmonth[1]  //one imputed month per person
replace gebmonat=rmonth if gebmonat<1          //impute where the birth month is not valid

save master.dta, replace                 //replace master file


/*********************************************************************
* If we do not want to use time-varying covariates, we are now done.
* We could start with continuous-time EHA:
cd $pfad2
use master.dta, clear
gen dur = exit - entry          //duration in months (assuming 15th day start and end)
replace dur=0.5 if dur==0       //there must be no zero durations
stset dur, failure(zensor==1) 
sts graph, survival tmax(36) ci                            
sts graph, hazard tmax(36) ci width(2)                         
sts graph, survival by(sex) ci tmax(36)
sts graph if loc1989==1|loc1989==2, survival by(loc1989) ci tmax(36)
*********************************************************************/


***************************************************************
*** 4. STEP: EPISODE-SPLITTING (EPISODE-MONTH FILE)     *******
***************************************************************
* Turn each episode into one record per month of unemployment.
gen     dur1 = exit-entry+1      //number of month splits (months reported unemployed)
expand  dur1                     //dur1 identical copies of every episode

* Creating time-varying covariates
bysort epiid: gen     t = _n                       //process time: month 1, 2, ... of the episode
gen                   d = 0                        //failure indicator
bysort epiid (t): replace d=1 if zensor==1 & t==_N //event only in the last month of an episode with zensor==1

* Current date in SOEP-metric (Jan 1983 == 1)
gen month83 = entry + (t-1)         //current date in SOEP time-metric

* Transforming all dates to Stata time-metric (Jan 1960 == 0, Jan 1983 == 276 [12*23])
replace entry   = entry + 275
replace exit    = exit  + 275
gen     month60 = entry + (t-1)     //current date in Stata time-metric

* The current year
gen  year = int((month83-1)/12) + 1983
* tab year, gen(y)

* The current calendar month (1 = Jan, ..., 12 = Dec)
* Derived arithmetically from the SOEP month counter. This gives the same
* values as looping over all months up to Dec 2009 (month83 == 324), but it
* needs a single pass over the data and also works for later months, which
* the loop with its fixed upper bound left at 0.
gen month = mod(month83-1, 12) + 1

cd $pfad2
save master.dta, replace                 //replace master file


***************************************************************
***   4. STEP: MATCHING TIME-VARYING INFO FROM $PGEN    *******
***************************************************************
* In the following merge is done by "id" and "year".
* More exact would be to merge "id" and "month60" (the interview month in PGEN).
* But then one would have to interpolate somehow to the non-interview months!
* Further, one could pull hhnrakt from $PGEN and then merge info from $HGEN!

* Wave letters of the PGEN files; the first one (a) belongs to survey year 1984
local waves a b c d e f g h i j k l m n o p q r s t u v w x y z ba
local year = 1984

* Prepare one small work file per wave
foreach wave of local waves {
	cd $pfad1
	use `wave'pgen.dta, clear
	rename persnr id                       // person identifier should be "id"
	generate year = `year'                 // survey year serves as wave identifier
	rename month intmonth
	generate month60 = ym(year, intmonth)  // interview month in Stata metric
	rename `wave'bilzeit educ
	keep id year educ                      // add "month60" if it is used for the merge
	cd $pfad2
	save `wave'work.dta, replace           // store the prepared wave
	local ++year
}

* Pool all waves: start from the first work file and append the remaining ones
gettoken first later : waves
use `first'work.dta, clear
foreach wave of local later {
	append using `wave'work.dta
}

* Attach the person-year information to the episode-month records
cd $pfad2
merge 1:m id year using master.dta
tabulate year _merge                 // _merge==2: episode-month without an interview in that year
keep if _merge != 1                  // discard person-years without unemployment episode
drop _merge                          // must be gone before the next merge


***************************************************************
***    STEP: MATCHING TIME-VARYING INFO FROM BIOs       *******
***************************************************************
* Now one could also merge info from BIOMARSM (marriage) or BIOBIRTH (kids).
* Ideally one would produce for each person a monthly dataset (from Jan 1983
* to Dec 2009) with #kids for instance and merge.


***************************************************************
*** 5. STEP: SAVE FINAL DATA SET                        *******
***************************************************************
* Variables that were only needed while building the file
drop gebjahr gebmonat rmonth dur1 month83

* Key variables, used for the column order and for the check listing below
local mainvars id epiid epinr t d month year month60 entry exit zensor epitot

order `mainvars'
sort  epiid t
compress

save ANALYSISDATASET2.dta, replace              // CHOOSE YOUR NAME!!!!

* Delete auxiliary files
erase master.dta
local waves a b c d e f g h i j k l m n o p q r s t u v w x y z ba
foreach wave of local waves {
	capture erase `wave'work.dta                // ignore wave files that are already gone
}


* Checking some cases
list `mainvars' if inlist(id, 203, 5604002), sepby(epinr) table

		  
